Conversation

linuxrocks123 (Contributor) commented Oct 23, 2025

This PR optimizes the pattern bitsin(typeof(x)) - popcnt(x) to s_bcnt0_i32 on AMDGPU. It also creates a Clang builtin for s_bcnt0_i32 so that users can call this instruction directly instead of relying on the compiler to match this pattern.
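
For illustration, a minimal sketch of the source-level shape being matched (hand-written for this summary, not taken from the PR; the function name is hypothetical):

// Count the zero bits of x. With this PR, the uniform (SGPR) case of
// "bit width minus popcount" lowers to a single s_bcnt0_i32_b32.
unsigned count_zero_bits(unsigned x) {
  return 32u - __builtin_popcount(x);
}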

linuxrocks123 marked this pull request as ready for review October 23, 2025 19:20
llvmbot added the clang, backend:AMDGPU, clang:frontend, and llvm:ir labels Oct 23, 2025
llvmbot (Member) commented Oct 23, 2025

@llvm/pr-subscribers-backend-amdgpu

@llvm/pr-subscribers-clang

Author: Patrick Simmons (linuxrocks123)

Changes

This PR optimizes the pattern bitsin(typeof(x)) - popcnt(x) to s_bcnt0_i32 on AMDGPU. It also creates a Clang builtin for s_bcnt0_i32 so that users can call this instruction directly instead of relying on the compiler to match this pattern.


Full diff: https://github.com/llvm/llvm-project/pull/164847.diff

5 Files Affected:

  • (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+3)
  • (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+8)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp (+43)
  • (modified) llvm/lib/Target/AMDGPU/SOPInstructions.td (+6-2)
  • (modified) llvm/test/CodeGen/AMDGPU/s_cmp_0.ll (+17-21)
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 8428fa97fe445..f17156f8a24ab 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -63,6 +63,9 @@ BUILTIN(__builtin_amdgcn_grid_size_z, "Ui", "nc")
 BUILTIN(__builtin_amdgcn_mbcnt_hi, "UiUiUi", "nc")
 BUILTIN(__builtin_amdgcn_mbcnt_lo, "UiUiUi", "nc")
 
+BUILTIN(__builtin_amdgcn_bcnt032_lo, "UiUi", "nc")
+BUILTIN(__builtin_amdgcn_bcnt064_lo, "UiWUi", "nc")
+
 TARGET_BUILTIN(__builtin_amdgcn_s_memtime, "WUi", "n", "s-memtime-inst")
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 9e334d4316336..50b43a1c927ce 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2359,6 +2359,14 @@ def int_amdgcn_mbcnt_hi :
   DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
             [IntrNoMem]>;
 
+def int_amdgcn_bcnt032_lo :
+  ClangBuiltin<"__builtin_amdgcn_bcnt032_lo">,
+  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
+
+def int_amdgcn_bcnt064_lo :
+  ClangBuiltin<"__builtin_amdgcn_bcnt064_lo">,
+  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>;
+
 // llvm.amdgcn.ds.swizzle src offset
 def int_amdgcn_ds_swizzle :
   ClangBuiltin<"__builtin_amdgcn_ds_swizzle">,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 8e35ba77d69aa..39b558694edf8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -26,6 +26,7 @@
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/ValueHandle.h"
@@ -35,6 +36,7 @@
 #include "llvm/Support/KnownFPClass.h"
 #include "llvm/Transforms/Utils/IntegerDivision.h"
 #include "llvm/Transforms/Utils/Local.h"
+#include <cstdint>
 
 #define DEBUG_TYPE "amdgpu-codegenprepare"
 
@@ -93,6 +95,13 @@ static cl::opt<bool> DisableFDivExpand(
   cl::ReallyHidden,
   cl::init(false));
 
+// Disable processing of fdiv so we can better test the backend implementations.
+static cl::opt<bool>
+    DisableBcnt0("amdgpu-codegenprepare-disable-bcnt0",
+                 cl::desc("Prevent transforming bitsin(typeof(x)) - "
+                          "popcount(x) to bcnt0(x) in AMDGPUCodeGenPrepare"),
+                 cl::ReallyHidden, cl::init(false));
+
 class AMDGPUCodeGenPrepareImpl
     : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
 public:
@@ -258,6 +267,7 @@ class AMDGPUCodeGenPrepareImpl
   bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);
 
   bool visitIntrinsicInst(IntrinsicInst &I);
+  bool visitCtpop(IntrinsicInst &I);
   bool visitFMinLike(IntrinsicInst &I);
   bool visitSqrt(IntrinsicInst &I);
   bool run();
@@ -1910,6 +1920,8 @@ bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
     return visitFMinLike(I);
   case Intrinsic::sqrt:
     return visitSqrt(I);
+  case Intrinsic::ctpop:
+    return visitCtpop(I);
   default:
     return false;
   }
@@ -1977,6 +1989,37 @@ Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
   return insertValues(Builder, FractArg->getType(), ResultVals);
 }
 
+bool AMDGPUCodeGenPrepareImpl::visitCtpop(IntrinsicInst &I) {
+  uint32_t BitWidth, DestinationWidth, IntrinsicWidth;
+  if (!I.hasOneUse() ||
+      !ST.hasBCNT(BitWidth = I.getType()->getIntegerBitWidth()))
+    return false;
+
+  BinaryOperator *MustBeSub = dyn_cast<BinaryOperator>(I.user_back());
+  if (!MustBeSub || MustBeSub->getOpcode() != BinaryOperator::Sub)
+    return false;
+
+  ConstantInt *FirstOperand = dyn_cast<ConstantInt>(MustBeSub->getOperand(0));
+  if (!FirstOperand || FirstOperand->getZExtValue() != BitWidth)
+    return false;
+
+  IRBuilder<> Builder(MustBeSub);
+  Instruction *TransformedIns =
+      Builder.CreateIntrinsic(BitWidth > 32 ? Intrinsic::amdgcn_bcnt064_lo
+                                            : Intrinsic::amdgcn_bcnt032_lo,
+                              {}, {I.getArgOperand(0)});
+
+  if ((DestinationWidth = MustBeSub->getType()->getIntegerBitWidth()) !=
+      (IntrinsicWidth = TransformedIns->getType()->getIntegerBitWidth()))
+    TransformedIns = cast<Instruction>(Builder.CreateZExtOrTrunc(
+        TransformedIns, Type::getIntNTy(I.getContext(), DestinationWidth)));
+
+  MustBeSub->replaceAllUsesWith(TransformedIns);
+  TransformedIns->takeName(MustBeSub);
+  MustBeSub->eraseFromParent();
+  return true;
+}
+
 bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) {
   Value *FractArg = matchFractPat(I);
   if (!FractArg)
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 84287b621fe78..29104d33a8aa8 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -264,8 +264,12 @@ def S_BREV_B64 : SOP1_64 <"s_brev_b64",
 } // End isReMaterializable = 1, isAsCheapAsAMove = 1
 
 let Defs = [SCC] in {
-def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32">;
-def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64">;
+def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32",
+  [(set i32:$sdst, (UniformUnaryFrag<int_amdgcn_bcnt032_lo> i32:$src0))]
+>;
+def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64",
+  [(set i32:$sdst, (UniformUnaryFrag<int_amdgcn_bcnt064_lo> i64:$src0))]
+>;
 def S_BCNT1_I32_B32 : SOP1_32 <"s_bcnt1_i32_b32",
   [(set i32:$sdst, (UniformUnaryFrag<ctpop> i32:$src0))]
 >;
diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
index dd5f838b4a206..db030d2b19d90 100644
--- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
@@ -444,16 +444,14 @@ define amdgpu_ps i32 @bfe_u64(i64 inreg %val0) {
 define amdgpu_ps i32 @bcnt032(i32 inreg %val0) {
 ; CHECK-LABEL: bcnt032:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_bcnt1_i32_b32 s0, s0
-; CHECK-NEXT:    s_sub_i32 s0, 32, s0
-; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s0
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
-; CHECK-NEXT:    ; return to shader part epilog
+; CHECK-NEXT:    s_bcnt0_i32_b32 s0, s0
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use s0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
+; CHECK-NEXT:    ; return to shader part epilog
   %result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
   %result2 = sub i32 32, %result
   call void asm "; use $0", "s"(i32 %result2)
@@ -465,17 +463,15 @@ define amdgpu_ps i32 @bcnt032(i32 inreg %val0) {
 define amdgpu_ps i32 @bcnt064(i64 inreg %val0) {
 ; CHECK-LABEL: bcnt064:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
-; CHECK-NEXT:    s_sub_u32 s0, 64, s0
-; CHECK-NEXT:    s_subb_u32 s1, 0, 0
-; CHECK-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[0:1]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
-; CHECK-NEXT:    ; return to shader part epilog
+; CHECK-NEXT:    s_bcnt0_i32_b64 s0, s[0:1]
+; CHECK-NEXT:    s_mov_b32 s1, 0
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use s[0:1]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
+; CHECK-NEXT:    ; return to shader part epilog
   %result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone
   %result2 = sub i64 64, %result
   call void asm "; use $0", "s"(i64 %result2)

@@ -93,6 +95,13 @@ static cl::opt<bool> DisableFDivExpand(
cl::ReallyHidden,
cl::init(false));

// Disable processing of fdiv so we can better test the backend implementations.
Contributor:
Comment needs to be updated.

Contributor Author:
Done

Comment on lines 2362 to 2363
def int_amdgcn_bcnt032_lo :
ClangBuiltin<"__builtin_amdgcn_bcnt032_lo">,
Contributor:
Is "bcnt032_lo" the name we want to use? For comparison:

  ClangBuiltin<"__builtin_amdgcn_sad_u8">,
  ClangBuiltin<"__builtin_amdgcn_msad_u8">,

follow the mnemonic without an initial "v_".

Contributor Author:
@LU-JOHN I removed the 0, which I feel makes them closer to the existing __builtin_amdgcn_mbcnt_lo. What do you think?

Re the option, I think most of the other transformations have options, so I feel this one should as well.

github-actions bot commented Oct 23, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

arsenm (Contributor) left a comment:
This should not introduce a builtin or intrinsic. This can be done purely in tablegen patterns, without the intermediate step.

; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
; CHECK-NEXT: ; return to shader part epilog
%result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone
Contributor:
Seems like missing tests? What happens in VALU cases? Negative tests for multiple uses of the popcnt?

Contributor Author:
Done

arsenm (Contributor) left a comment:
This should not introduce a new intrinsic and only needs a tablegen pattern. The one benefit you would get out of doing this fold in the IR is sinking a popcnt out of a block, but you can do that just by handling this case in isProfitableToSinkOperands, and that avoids teaching all of the known bits / sign bits / simplify demanded bits code about this
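
(For illustration, a rough sketch of the suggested alternative, hand-written here rather than taken from the PR; the GCNTTIImpl hook name, its signature, and the PatternMatch usage are assumptions:)

// Hypothetical sketch: if I is "bitwidth - ctpop(x)" with a single-use
// ctpop, report the ctpop operand as profitable to sink so that
// CodeGenPrepare moves it into the subtract's block, where a tablegen
// pattern can then match the whole expression at once.
bool GCNTTIImpl::isProfitableToSinkOperands(Instruction *I,
                                            SmallVectorImpl<Use *> &Ops) const {
  using namespace llvm::PatternMatch;
  Value *X;
  if (match(I, m_Sub(m_SpecificInt(I->getType()->getScalarSizeInBits()),
                     m_OneUse(m_Intrinsic<Intrinsic::ctpop>(m_Value(X)))))) {
    Ops.push_back(&I->getOperandUse(1)); // operand 1 is the ctpop
    return true;
  }
  return false;
}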

DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
[IntrNoMem]>;

def int_amdgcn_bcnt32_lo :
Contributor:
This does not need an intrinsic. This doesn't help match the pattern in any cases, and introduces new support burdens to every optimization

Contributor Author:
Removed.

Contributor:
Doesn't seem to be removed?

Contributor Author:
@jayfoad there is no change to Clang introducing an intrinsic now. What are you referring to?

linuxrocks123 (Contributor Author) commented Oct 28, 2025:
@arsenm were you referring to the Clang intrinsic or all code in IntrinsicsAMDGPU.td? If you do not want this file changed, how can I perform the optimization in AMDGPUCodeGenPrepare.cpp without referring to Intrinsic::amdgcn_bcnt32_lo and Intrinsic::amdgcn_bcnt64_lo?

Contributor:
"Intrinsic" means the LLVM IR special function llvm.amdgcn.bcnt32.lo(). "Builtin" means the Clang/C++ special function __builtin_amdgcn_bcnt32_lo().

jayfoad (Contributor) left a comment:
Looks good overall.

@@ -1884,6 +1886,13 @@ def : GCNPat <
(S_MOV_B32 (i32 0)), sub1))
>;

def : GCNPat <
(i64 (UniformBinFrag<sub> 64, (UniformUnaryFrag<ctpop> i64:$src))),
Contributor:
Same here.

Contributor Author:
Done

@@ -264,7 +264,9 @@ def S_BREV_B64 : SOP1_64 <"s_brev_b64",
} // End isReMaterializable = 1, isAsCheapAsAMove = 1

let Defs = [SCC] in {
def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32">;
def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32",
[(set i32:$sdst, (UniformBinFrag<sub> 32, (UniformUnaryFrag<ctpop> i32:$src0)))]
Contributor:
You don't need both UniformFrags. Standard practice is to put it only on the outermost expression:

Suggested change
[(set i32:$sdst, (UniformBinFrag<sub> 32, (UniformUnaryFrag<ctpop> i32:$src0)))]
[(set i32:$sdst, (UniformBinFrag<sub> 32, (ctpop i32:$src0)))]

(In this particular case I think it would also work to put it only on the inner expression, but let's not do that.)

Contributor Author:
Done

Comment on lines 467 to 468
; CHECK-NEXT: s_mov_b32 s1, 0
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
Contributor:
As a follow-up, it would be good if we could somehow generate a 32-bit compare instead here:

Suggested change
; CHECK-NEXT: s_mov_b32 s1, 0
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cmp_lg_u32 s0, 0

linuxrocks123 (Contributor Author) commented Oct 29, 2025:
@jayfoad yes, I noticed this, too. I think what we really should do here is reverse the order of the MOV and the BCNT instructions. If we did that, we could eliminate the comparison instruction entirely since BCNT already updates SCC.

I think that work belongs in a separate PR that runs on the Machine IR shortly after ISel. Such a pass would help here, but it may also catch other opportunities unrelated to this one.

@@ -625,3 +622,111 @@ if:
endif:
ret i32 1
}

define amdgpu_ps void @bcnt032_not_for_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) {
Contributor:
s_cmp_0.ll was intended to test deleting redundant s_cmp* sX, 0 instructions. These new bcnt0* tests should be in a different file.

Contributor Author:
Done

LU-JOHN changed the title from "Match bitsin(typeof(x)) - popcnt(x) to s_bcnt0_i32 on AMDGPU" to "[AMDGPU] Match bitsin(typeof(x)) - popcnt(x) to s_bcnt0_i32" Oct 30, 2025
LU-JOHN (Contributor) left a comment:
LGTM

LU-JOHN requested a review from arsenm October 30, 2025 14:29
%cmp = icmp ne i64 %result2, 0
%zext = zext i1 %cmp to i32
ret i32 %zext
}
Contributor:
Missing newline at end of file

Contributor Author:
Fixed

@@ -0,0 +1,110 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s
Contributor:
Add a GlobalISel run line to check that the patterns work there too?

linuxrocks123 (Contributor Author) commented Oct 31, 2025:
@jayfoad, the negative tests crash Global ISel, so I can't add a check unless I break out the positive tests to a separate file. I'll do that if you like, but I think a better approach would be to file a JIRA issue to look into that.

Contributor:
I'm surprised anything here breaks globalisel. I'd expect this of all things to work better. What is the crash?

Contributor Author:
@arsenm

LLVM ERROR: unable to map instruction: %8:sreg_32 = COPY %7:vgpr(s32) (in function: bcnt032_not_for_vregs)

The crash requires the test as written WITHOUT your suggested changes. I updated the tests per your comments, but please let me know if you'd like me to revert that commit to preserve the crashing testcase. (I can add an XFAIL line.)

Contributor:
This is failing on the broken v-to-s copy for the asm input. I think this is the global isel flavored version of the case the DAG path miscompiles into ignoring the SGPR constraint

arsenm (Contributor) left a comment:
Description is out of date

Comment on lines 44 to 35
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use v[5:6]
; CHECK-NEXT: ;;#ASMEND
Contributor:
Oh, this is a bad bug. Your SGPR constraint was lost and silently transmuted into a VGPR. Not related to this PR, though; for your purposes you're just using an overly complicated test.

Contributor Author:
Yikes! Do you want to file the issue or should I?

Contributor:
Probably a duplicate of #89971

; CHECK-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid
%val0 = load volatile i64, ptr addrspace(1) %gep
Contributor:
A volatile load is ineligible for the VALU-load-to-scalar-load optimization. For your purposes, it is simpler to use an inreg argument to the shader calling convention rather than all of this boilerplate to load the value from memory.

linuxrocks123 (Contributor Author) commented Nov 4, 2025:
I think inreg is for non-divergent inputs. I wanted a vector bcnt to make sure this PR does NOT attempt to optimize vector bcnt instructions, because it shouldn't. I did restructure the tests to get rid of the memory stuff, though.

; CHECK-NEXT: ; use s[2:3]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ; return to shader part epilog
%result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone
Contributor:
Suggested change
%result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone
%result = call i64 @llvm.ctpop.i64(i64 %val0)

Don't need the callsite attributes

Contributor Author:
Done

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s

define amdgpu_ps i32 @bcnt032_not_for_vregs(i64 %val) {
Contributor:
Not sure why this is using a 64-bit input; this would work just as well with i32?

Contributor Author:
Done

%val0 = trunc i64 %val to i32
%result = call i32 @llvm.ctpop.i32(i32 %val0)
%result2 = sub i32 32, %result
call void asm "; use $0", "s"(i32 %result2)
Contributor:
This is still hitting that asm bug, but I'm not sure why you're using the asm here in the first place. You just need a VGPR return?

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s

define amdgpu_ps i32 @bcnt032_not_for_vregs(i64 %val) {
Contributor:
If you just want to test VGPR input + VGPR output, drop the amdgpu_ps

Contributor Author:
Done

; CHECK-NEXT: ; return to shader part epilog
%result = call i64 @llvm.ctpop.i64(i64 %val0)
%result2 = sub i64 64, %result
call void asm "; use $0", "s"(i64 %result2)
Contributor:
Same comments as previous function

Contributor Author:
Done

linuxrocks123 (Contributor Author):
@arsenm LGTM?

linuxrocks123 (Contributor Author):
@arsenm just saw your testcase comments. One sec.

linuxrocks123 (Contributor Author):
@arsenm okay, comments dealt with. If you'd like to preserve the global isel error, I should revert or partially revert the last commit. Otherwise, LGTM?

let Defs = [SCC] in {
def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32">;
def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32",
[(set i32:$sdst, (UniformBinFrag<sub> 32, (ctpop i32:$src0)))]
Contributor:
Suggested change
[(set i32:$sdst, (UniformBinFrag<sub> 32, (ctpop i32:$src0)))]
[(set i32:$sdst, (UniformBinFrag<sub> 32, (ctpop_oneuse i32:$src0)))]

Will need to put in AMDGPUInstructions.td

def ctpop_oneuse : HasOneUseUnaryOp<ctpop>;

Comment on lines +39 to +40
; CHECK-NEXT: s_bcnt1_i32_b32 s1, s0
; CHECK-NEXT: s_bcnt0_i32_b32 s0, s0
Contributor:
I'm not sure we should do this in the multiple-use case. It's not worse, but it's trading for an equivalently good instruction.

Contributor Author:
@arsenm I thought so, too, but they're actually not equivalent because of SCC. The multiple use transformation has the potential to get rid of an unnecessary compare instruction because s_bcnt0 will set SCC to 1 when the result is nonzero. Here's the master versus branch diff for bcnt032_ctpop_multiple_uses:

@@ -105,14 +105,13 @@
 	.type	bcnt032_ctpop_multiple_uses,@function
 bcnt032_ctpop_multiple_uses:            ; @bcnt032_ctpop_multiple_uses
 ; %bb.0:
-	s_bcnt1_i32_b32 s0, s0
-	s_sub_i32 s1, 32, s0
-	s_cmp_lg_u32 s1, 0
+	s_bcnt1_i32_b32 s1, s0
+	s_bcnt0_i32_b32 s0, s0
 	;;#ASMSTART
-	; use s0
+	; use s1
 	;;#ASMEND
 	;;#ASMSTART
-	; use s1
+	; use s0
 	;;#ASMEND
 	s_cselect_b64 s[0:1], -1, 0
 	v_cndmask_b32_e64 v0, 0, 1, s[0:1]

bcnt064_ctpop_multiple_uses does not demonstrate an improvement versus master in instruction count, but I think it is possible to save a compare instruction there as well sometimes.
